library('tidyverse')
library('e1071')
library('caret')
library('pROC')
library('rsample')
Modeling for Default
Modeling default for loan applications with Support Vector Machines
Approach
I’m focusing on modeling for DEFAULT
using Support Vector Machines.
A suport vector machine could be a good approach due to the nature of the problem and data:
- Many features
- Capturing some non-linear relationships
- Robustness to overfitting
- Binary classification
Prep
Let’s load a few libraries:
We’ll load the cleaned, balanced, training data:
<- 'D:/All Repos/home-credit-default-risk-group/data/application_train_smote.csv'
path
<- data.table::fread(path) |>
data as.data.frame() |>
mutate(DEFAULT = factor(DEFAULT, levels = c("N", "Y")))
glimpse(data)
Rows: 555,256
Columns: 181
$ CASH_LOAN.N <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ CASH_LOAN.Y <dbl> 1, 1, 1, 1, 1, 1, 1,…
$ GENDER_MALE.N <dbl> 0, 1, 1, 1, 0, 1, 1,…
$ GENDER_MALE.Y <dbl> 1, 0, 0, 0, 1, 0, 0,…
$ FLAG_OWN_CAR.N <dbl> 0, 0, 1, 1, 0, 1, 1,…
$ FLAG_OWN_CAR.Y <dbl> 1, 1, 0, 0, 1, 0, 0,…
$ FLAG_OWN_REALTY.N <dbl> 1, 0, 0, 0, 1, 1, 0,…
$ FLAG_OWN_REALTY.Y <dbl> 0, 1, 1, 1, 0, 0, 1,…
$ CNT_CHILDREN <dbl> 0, 1, 0, 0, 0, 1, 0,…
$ AMT_INCOME_TOTAL <dbl> 90000, 252000, 31500…
$ AMT_CREDIT <dbl> 263686.5, 675000.0, …
$ AMT_ANNUITY <dbl> 19237.5, 53460.0, 52…
$ AMT_GOODS_PRICE <dbl> 238500, 675000, 9000…
$ NAME_TYPE_SUITE.Children <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_TYPE_SUITE.Family <dbl> 0, 1, 0, 0, 0, 0, 1,…
$ NAME_TYPE_SUITE.Group.of.people <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_TYPE_SUITE.Other_A <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_TYPE_SUITE.Other_B <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_TYPE_SUITE.Spouse..partner <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_TYPE_SUITE.Unaccompanied <dbl> 1, 0, 1, 1, 1, 1, 0,…
$ NAME_INCOME_TYPE.Businessman <int> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_INCOME_TYPE.Commercial.associate <dbl> 0, 0, 1, 1, 0, 0, 0,…
$ NAME_INCOME_TYPE.Maternity.leave <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_INCOME_TYPE.Pensioner <dbl> 0, 0, 0, 0, 0, 0, 1,…
$ NAME_INCOME_TYPE.State.servant <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_INCOME_TYPE.Student <int> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_INCOME_TYPE.Unemployed <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_INCOME_TYPE.Working <dbl> 1, 1, 0, 0, 1, 1, 0,…
$ NAME_EDUCATION_TYPE.Academic.degree <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_EDUCATION_TYPE.Higher.education <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_EDUCATION_TYPE.Incomplete.higher <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_EDUCATION_TYPE.Lower.secondary <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_EDUCATION_TYPE.Secondary...secondary.special <dbl> 1, 1, 1, 1, 1, 1, 1,…
$ NAME_FAMILY_STATUS.Civil.marriage <dbl> 0, 0, 0, 0, 0, 1, 0,…
$ NAME_FAMILY_STATUS.Married <dbl> 1, 1, 1, 0, 1, 0, 0,…
$ NAME_FAMILY_STATUS.Separated <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_FAMILY_STATUS.Single...not.married <dbl> 0, 0, 0, 1, 0, 0, 0,…
$ NAME_FAMILY_STATUS.Widow <dbl> 0, 0, 0, 0, 0, 0, 1,…
$ NAME_HOUSING_TYPE.Co.op.apartment <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_HOUSING_TYPE.House...apartment <dbl> 0, 1, 1, 0, 1, 1, 1,…
$ NAME_HOUSING_TYPE.Municipal.apartment <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_HOUSING_TYPE.Office.apartment <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_HOUSING_TYPE.Rented.apartment <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_HOUSING_TYPE.With.parents <dbl> 1, 0, 0, 1, 0, 0, 0,…
$ REGION_POPULATION_RELATIVE <dbl> 0.002134, 0.014520, …
$ DAYS_BIRTH <dbl> -10042, -10464, -172…
$ DAYS_EMPLOYED <dbl> -1598, -2245, -195, …
$ DAYS_REGISTRATION <dbl> -167, -169, -11387, …
$ DAYS_ID_PUBLISH <dbl> -2529, -3111, -180, …
$ OWN_CAR_AGE <dbl> 19, 15, 0, 0, 4, 0, …
$ FLAG_EMP_PHONE.N <dbl> 0, 0, 0, 0, 0, 0, 1,…
$ FLAG_EMP_PHONE.Y <dbl> 1, 1, 1, 1, 1, 1, 0,…
$ FLAG_WORK_PHONE.N <dbl> 1, 1, 0, 1, 1, 1, 1,…
$ FLAG_WORK_PHONE.Y <dbl> 0, 0, 1, 0, 0, 0, 0,…
$ FLAG_CONT_MOBILE.N <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ FLAG_CONT_MOBILE.Y <dbl> 1, 1, 1, 1, 1, 1, 1,…
$ FLAG_PHONE.N <dbl> 1, 1, 0, 1, 1, 1, 0,…
$ FLAG_PHONE.Y <dbl> 0, 0, 1, 0, 0, 0, 1,…
$ FLAG_EMAIL.N <dbl> 1, 1, 1, 1, 1, 1, 1,…
$ FLAG_EMAIL.Y <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Accountants <dbl> 0, 0, 1, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Cleaning.staff <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Cooking.staff <dbl> 0, 0, 0, 0, 1, 0, 0,…
$ OCCUPATION_TYPE.Core.staff <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Drivers <dbl> 1, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.High.skill.tech.staff <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.HR.staff <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.IT.staff <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Laborers <dbl> 0, 0, 0, 0, 0, 1, 0,…
$ OCCUPATION_TYPE.Low.skill.Laborers <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Managers <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Medicine.staff <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Private.service.staff <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Realty.agents <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Sales.staff <dbl> 0, 1, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Secretaries <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Security.staff <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Waiters.barmen.staff <dbl> 0, 0, 0, 1, 0, 0, 0,…
$ OCCUPATION_TYPE.XNA <dbl> 0, 0, 0, 0, 0, 0, 1,…
$ CNT_FAM_MEMBERS <dbl> 2, 3, 2, 1, 2, 3, 1,…
$ REGION_RATING_CLIENT <dbl> 3, 2, 1, 2, 2, 2, 2,…
$ REGION_RATING_CLIENT_W_CITY <dbl> 3, 2, 1, 2, 2, 2, 2,…
$ WEEKDAY_APPR_PROCESS_START.FRIDAY <dbl> 1, 1, 0, 1, 0, 0, 0,…
$ WEEKDAY_APPR_PROCESS_START.MONDAY <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ WEEKDAY_APPR_PROCESS_START.SATURDAY <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ WEEKDAY_APPR_PROCESS_START.SUNDAY <dbl> 0, 0, 1, 0, 0, 0, 0,…
$ WEEKDAY_APPR_PROCESS_START.THURSDAY <dbl> 0, 0, 0, 0, 0, 1, 1,…
$ WEEKDAY_APPR_PROCESS_START.TUESDAY <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ WEEKDAY_APPR_PROCESS_START.WEDNESDAY <dbl> 0, 0, 0, 0, 1, 0, 0,…
$ HOUR_APPR_PROCESS_START <dbl> 8, 12, 10, 13, 15, 1…
$ REG_REGION_NOT_LIVE_REGION.N <dbl> 1, 1, 1, 1, 1, 1, 1,…
$ REG_REGION_NOT_LIVE_REGION.Y <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ REG_REGION_NOT_WORK_REGION.N <dbl> 1, 1, 1, 1, 1, 1, 1,…
$ REG_REGION_NOT_WORK_REGION.Y <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ LIVE_REGION_NOT_WORK_REGION.N <dbl> 1, 1, 1, 1, 1, 1, 1,…
$ LIVE_REGION_NOT_WORK_REGION.Y <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ REG_CITY_NOT_LIVE_CITY.N <dbl> 1, 1, 1, 0, 0, 1, 1,…
$ REG_CITY_NOT_LIVE_CITY.Y <dbl> 0, 0, 0, 1, 1, 0, 0,…
$ REG_CITY_NOT_WORK_CITY.N <dbl> 1, 1, 1, 0, 0, 1, 1,…
$ REG_CITY_NOT_WORK_CITY.Y <dbl> 0, 0, 0, 1, 1, 0, 0,…
$ LIVE_CITY_NOT_WORK_CITY.N <dbl> 1, 1, 1, 1, 1, 1, 1,…
$ LIVE_CITY_NOT_WORK_CITY.Y <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Advertising <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Agriculture <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Bank <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Business.Entity.Type.1 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Business.Entity.Type.2 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Business.Entity.Type.3 <dbl> 1, 0, 1, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Cleaning <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Construction <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Culture <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Electricity <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Emergency <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Government <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Hotel <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Housing <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.1 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.10 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.11 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.12 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.13 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.2 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.3 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.4 <dbl> 0, 1, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.5 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.6 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.7 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.8 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.9 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Insurance <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Kindergarten <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Legal.Services <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Medicine <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Military <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Mobile <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Other <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Police <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Postal <dbl> 0, 0, 0, 0, 0, 1, 0,…
$ ORGANIZATION_TYPE.Realtor <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Religion <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Restaurant <dbl> 0, 0, 0, 1, 1, 0, 0,…
$ ORGANIZATION_TYPE.School <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Security <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Security.Ministries <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Self.employed <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Services <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Telecom <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Trade..type.1 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Trade..type.2 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Trade..type.3 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Trade..type.4 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Trade..type.5 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Trade..type.6 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Trade..type.7 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Transport..type.1 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Transport..type.2 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Transport..type.3 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Transport..type.4 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.University <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.XNA <dbl> 0, 0, 0, 0, 0, 0, 1,…
$ EXT_SOURCE_1 <dbl> 0.5021294, 0.4415526…
$ EXT_SOURCE_2 <dbl> 0.149016484, 0.48337…
$ EXT_SOURCE_3 <dbl> 0.3996756, 0.5406545…
$ OBS_30_CNT_SOCIAL_CIRCLE <dbl> 2, 0, 0, 1, 0, 1, 0,…
$ DEF_30_CNT_SOCIAL_CIRCLE <dbl> 1, 0, 0, 0, 0, 1, 0,…
$ OBS_60_CNT_SOCIAL_CIRCLE <dbl> 2, 0, 0, 1, 0, 1, 0,…
$ DEF_60_CNT_SOCIAL_CIRCLE <dbl> 1, 0, 0, 0, 0, 1, 0,…
$ DAYS_LAST_PHONE_CHANGE <dbl> -889, -1043, 0, -251…
$ AMT_REQ_CREDIT_BUREAU_HOUR <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ AMT_REQ_CREDIT_BUREAU_DAY <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ AMT_REQ_CREDIT_BUREAU_WEEK <dbl> 0, 0, 1, 0, 0, 0, 0,…
$ AMT_REQ_CREDIT_BUREAU_MON <dbl> 0, 0, 0, 0, 0, 0, 1,…
$ AMT_REQ_CREDIT_BUREAU_QRT <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ AMT_REQ_CREDIT_BUREAU_YEAR <dbl> 2, 1, 1, 1, 4, 1, 1,…
$ IMPUTED_EXT1.N <dbl> 0, 1, 1, 1, 1, 1, 0,…
$ IMPUTED_EXT1.Y <dbl> 1, 0, 0, 0, 0, 0, 1,…
$ IMPUTED_EXT2.N <dbl> 1, 1, 1, 1, 1, 1, 1,…
$ IMPUTED_EXT2.Y <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ IMPUTED_EXT3.N <dbl> 1, 1, 1, 1, 1, 1, 1,…
$ IMPUTED_EXT3.Y <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ DEFAULT <fct> Y, Y, Y, Y, Y, Y, Y,…
All features in this file may conceivably be helpful in predicting DEFAULT
(originally named TARGET
), with exception of SK_ID_CURR
. Additionally, we need to remove all categorical variables that have just 1 class (essentially, constants).
We do have a class imbalance to work out:
We’ll work with a sample of the data since we have over 300K records.
set.seed(2015)
# Splits
<- createDataPartition(data$DEFAULT, p = 0.8, list = FALSE)
partition_idx <- data[partition_idx,]
full_train <- data[-partition_idx,]
full_test
# 2% sample of the data
<- sample_n(
train_sampl ceiling(nrow(full_train) * 0.02)
full_train,
)<- sample_n(
test_sampl ceiling(nrow(full_test) * 0.05)
full_test, )
<- sapply(train_sampl, function(x) length(unique(x)))
unique <- names(unique[unique == 1])
remove_cols
<- train_sampl |> select(-all_of(remove_cols))
train_sampl <- test_sampl |> select(-all_of(remove_cols)) test_sampl
All features in this file may conceivably be helpful in predicting DEFAULT
(originally named TARGET
) so we shouldn’t need to subset any of them.
Support Vector machine
We also want to setup our cross validation requiring folds, a tuning grid, and a loop:
<- rsample::vfold_cv(train_sampl) train_folds
<- expand.grid(
fitGrid sigma = c(0.01, 0.05),
C = c(0.01, 0.5, 1, 5, 10),
AUC = 0
)
Basic Modeling
We can now setup a cross-validated training of DEFAULT
with the SMOTE data:
<- function(grid, folds) {
runCvSVM for(i in 1:nrow(folds)) {
# Values for modeling
<- folds$splits[i][[1]]
fold <- grid$sigma[i]
sigma <- grid$C[i]
C
# Training
<- training(fold)
train <- sapply(train, function(x) length(unique(x)))
unique <- names(unique[unique == 1])
remove_cols
<- train |> select(-all_of(remove_cols))
train <- e1071::svm(DEFAULT ~ ., data = train, sigma = sigma, C = C)
modl
# Testing
<- testing(fold) |> select(-all_of(remove_cols))
test <- predict(modl, newdata = test, type = 'response')
pred
# Evaluation
<- roc(ifelse(test$DEFAULT == "Y", 1, 0), ifelse(pred == "Y", 1, 0))
roc_obj <- auc(roc_obj)
auc
# Save measure
$AUC[i] = auc
gridprint(paste0("AUC from fold ", i, ":"))
print(auc)
}return(grid)
}
<- runCvSVM(fitGrid, train_folds) results
[1] "AUC from fold 1:"
Area under the curve: 0.8189
[1] "AUC from fold 2:"
Area under the curve: 0.837
[1] "AUC from fold 3:"
Area under the curve: 0.8375
[1] "AUC from fold 4:"
Area under the curve: 0.8222
[1] "AUC from fold 5:"
Area under the curve: 0.8365
[1] "AUC from fold 6:"
Area under the curve: 0.8246
[1] "AUC from fold 7:"
Area under the curve: 0.8206
[1] "AUC from fold 8:"
Area under the curve: 0.8539
[1] "AUC from fold 9:"
Area under the curve: 0.826
[1] "AUC from fold 10:"
Area under the curve: 0.823
results
sigma C AUC
1 0.01 0.01 0.8189310
2 0.05 0.01 0.8369932
3 0.01 0.50 0.8375020
4 0.05 0.50 0.8221758
5 0.01 1.00 0.8365004
6 0.05 1.00 0.8245600
7 0.01 5.00 0.8205697
8 0.05 5.00 0.8539352
9 0.01 10.00 0.8259827
10 0.05 10.00 0.8229776
The AUC values are looking really great. At around 0.83, that’s even better than we were estimating with a balanced, penalized regression model.
Let’s now generate full metrics from the best model we’ve got here: sigma = 0.05
and C = 5.00
:
<- e1071::svm(DEFAULT ~ ., data = train_sampl, sigma = 0.05, C = 5.00) tuned_mod
<- predict(tuned_mod, newdata = test_sampl, type = 'response') pred_probs
Now, we need to find the ideal threshold for classification of the probabilities:
<- roc(ifelse(test_sampl$DEFAULT == "Y", 1, 0), ifelse(pred_probs == "Y", 1, 0))
roc_obj <- roc_obj$sensitivities + roc_obj$specificities - 1
youden_index
<- which.max(youden_index)
optimal_index <- roc_obj$thresholds[optimal_index]
optimal_threshold optimal_threshold
[1] 0.5
We can now use this optimal threshold for the confusion matrix:
<- confusionMatrix(
mat3 factor(ifelse(pred_probs == "Y", 1, 0), levels = c(1,0)),
factor(ifelse(test_sampl$DEFAULT == "Y", 1, 0), levels = c(1,0))
)
mat3
Confusion Matrix and Statistics
Reference
Prediction 1 0
1 2045 252
0 671 2585
Accuracy : 0.8338
95% CI : (0.8237, 0.8435)
No Information Rate : 0.5109
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.6663
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.7529
Specificity : 0.9112
Pos Pred Value : 0.8903
Neg Pred Value : 0.7939
Prevalence : 0.4891
Detection Rate : 0.3683
Detection Prevalence : 0.4137
Balanced Accuracy : 0.8321
'Positive' Class : 1
Let’s also grab the AUC value:
<- auc(roc_obj)
auc3 auc3
Area under the curve: 0.8321
Let’s compile all these metrics into a dataframe and save the results:
<- c(
performance c("model" = "Support Vector Machine"),
c("hyperparameters" = paste(
paste("Sigma:", 0.1),
paste("C:", 10.0),
collapse = ", "
)), $overall[c("Accuracy")],
mat3$byClass[c("Precision", "Recall")],
mat3c("AUC" = auc3)
)
performance
model hyperparameters Accuracy
"Support Vector Machine" "Sigma: 0.1 C: 10" "0.833783540428597"
Precision Recall AUC
"0.890291684806269" "0.752945508100147" "0.832059641607352"
data.frame(as.list(performance)) |>
write.csv('models/support-vector-machine/model-results.csv', row.names = FALSE)
Conclusion
Despite all of my research, I can’t figure out why the cross validation AUC is so poor but the AUC with the testing data is so good. It has to be an issue with the cross validation using caret.